Ivonne V. Yanez Mendoza
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, ShuffleSplit, LeaveOneOut, StratifiedKFold, cross_val_score, cross_val_predict, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score,classification_report, f1_score, roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
from sklearn.ensemble import AdaBoostClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Normalizer, Binarizer, RobustScaler, OneHotEncoder, LabelEncoder, PowerTransformer, QuantileTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_validate
import warnings
warnings.filterwarnings("ignore")
from statistics import mean
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score
from sklearn.metrics import make_scorer
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import lightgbm as lgb
from sklearn.tree import DecisionTreeClassifier
import plotly.express as px
import plotly
from sklearn.model_selection import learning_curve
import pickle
from xgboost import XGBClassifier
%run auxiliar.py
# Reproducibility seed used throughout the notebook.
seed = 12345

# Load the cleaned dataset and work on a defensive copy.
data = pd.read_csv("../data/superstore_clean.csv")
df_limpio = data.copy()

# Predictors (X) and binary target (y).
X = df_limpio.drop("Response", axis=1)
y = df_limpio["Response"]

# Stratified 70/30 hold-out split so both sets keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=seed, stratify=y
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((1565, 28), (671, 28), (1565,), (671,))
El archivo auxiliar.py contiene funciones que se han desarrollado para evitar redundancia de código, además de mantener un espacio de trabajo limpio.
# (label, estimator) pairs to benchmark; the labels are echoed in the report printouts.
algoritmos = [
    ("Logistica", LogisticRegression()),
    ("Arbol de decision", DecisionTreeClassifier(random_state=seed)),
    ('AdaBoost', AdaBoostClassifier(random_state=seed)),
    ('CatBoost', CatBoostClassifier(logging_level='Silent')),
    ("LightGBM", LGBMClassifier(random_state=seed)),
    ("Random forest", RandomForestClassifier(random_state=seed)),
    ("Xgboost", XGBClassifier(random_state=seed)),
    ("GBM", GradientBoostingClassifier(random_state=seed)),
]
# Cross-validated F1 for each candidate model (modelos_ml is defined in auxiliar.py).
modelos_ml(algoritmos, X_train, y_train)
F1 score Logistica: 0.40239649036089525 F1 score Arbol de decision: 0.4218812418584455 F1 score AdaBoost: 0.4791690673650516 F1 score CatBoost: 0.435130556105217 F1 score LightGBM: 0.43162053423839686 F1 score Random forest: 0.31931911935482193 F1 score Xgboost: 0.4859999051284272 F1 score GBM: 0.43927272727272726
# Baseline F1 on the held-out test set (modelos_pred is defined in auxiliar.py).
modelos_pred(algoritmos, X_train, y_train, X_test, y_test)
Logistica: 0.27480916030534347 Arbol de decision: 0.41584158415841577 AdaBoost: 0.48447204968944096 CatBoost: 0.40845070422535207 LightGBM: 0.47133757961783446 Random forest: 0.28346456692913385 Xgboost: 0.4528301886792453 GBM: 0.3835616438356165
Como primera aproximación, los que podrían funcionar mejor para este estudio son AdaBoost, LightGBM o XGBoost, pero es preciso mirar en detalle, pues hay un conjunto de métricas y situaciones a evaluar antes de elegir un modelo.
# Model-selection metric for every grid search below: F1 on the positive class.
scorer = make_scorer(f1_score)
%%time
model = DecisionTreeClassifier(random_state=seed)
# Estandarizar
scaler = QuantileTransformer()
# Pipeline
pipeline = Pipeline([('Standard scaler', scaler),
('Modelo', model)])
# tunning
parameters = {
"Modelo__max_depth": [10, 30, 5],
"Modelo__min_samples_leaf": [3, 5, 7],
"Modelo__max_leaf_nodes": [2, 3, 5],
"Modelo__min_impurity_decrease": [0.0001, 0.001],
}
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)
destree = GridSearchCV(estimator = pipeline, param_grid = parameters, scoring = scorer, cv = cv, n_jobs = -1)
destree.fit(X_train,y_train)
# scorer: recall
print(f'Mejores parametros: {destree.best_params_}')
print(f'Mejor score: {destree.best_score_}')
Mejores parametros: {'Modelo__max_depth': 10, 'Modelo__max_leaf_nodes': 5, 'Modelo__min_impurity_decrease': 0.0001, 'Modelo__min_samples_leaf': 3}
Mejor score: 0.09555965145152089
CPU times: user 513 ms, sys: 210 ms, total: 724 ms
Wall time: 4.04 s
# Decision tree refit on the full training set with the tuned hyper-parameters.
detree = DecisionTreeClassifier(max_depth=10,
                                max_leaf_nodes=5,
                                min_impurity_decrease=0.0001,
                                min_samples_leaf=3,
                                random_state=seed)
detree.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=10, max_leaf_nodes=5,
min_impurity_decrease=0.0001, min_samples_leaf=3,
random_state=12345)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. DecisionTreeClassifier(max_depth=10, max_leaf_nodes=5,
min_impurity_decrease=0.0001, min_samples_leaf=3,
random_state=12345)# Resultados en train
res_dttrain = reportes(detree, X_train, y_train, nombre = "decision train")
res_dttrain
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| decision train | 0.8569 | 0.0513 | 0.5249 | 0.8571 | 0.0968 |
# Test-set metrics for the tuned tree (original comment said "train" by mistake).
res_dttest = reportes(detree, X_test, y_test, nombre = "decision test")
res_dttest
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| decision test | 0.8525 | 0.03 | 0.5132 | 0.6 | 0.0571 |
# Test-set confusion matrix (helper from auxiliar.py).
confusion(detree, X_test, y_test)
Con GridSearchCV
%%time
# n_jobs=-1 para que ocupe todos los cores
model = RandomForestClassifier(random_state = seed)
# Transformador
scaler = QuantileTransformer()
# Pipeline
pipeline = Pipeline([('Standard scaler', scaler),
('Modelo', model)])
params = {
'Modelo__n_estimators': [200,300,400,500],
'Modelo__max_features': ['sqrt', 'log2'],
'Modelo__max_depth' : [4,5,6,7,8],
'Modelo__criterion' :['gini', 'entropy']
}
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)
gsr = GridSearchCV(estimator = pipeline, param_grid = params, scoring=scorer, cv = cv, n_jobs = -1)
gsr.fit(X_train, y_train)
# scorer: recall
print(f'Mejores parametros: {gsr.best_params_}')
print(f'Mejor score: {gsr.best_score_}')
Mejores parametros: {'Modelo__criterion': 'gini', 'Modelo__max_depth': 8, 'Modelo__max_features': 'sqrt', 'Modelo__n_estimators': 200}
Mejor score: 0.26929049640021985
CPU times: user 1.75 s, sys: 371 ms, total: 2.12 s
Wall time: 54.6 s
# Random forest refit with the tuned hyper-parameters.
# Bug fix: random_state was missing here, so the refit was not reproducible
# even though every other model in this notebook is seeded.
rf = RandomForestClassifier(
    n_estimators = 200,
    criterion = 'gini',
    max_depth = 8,
    max_features = 'sqrt',
    random_state = seed
)
rf.fit(X_train, y_train)
RandomForestClassifier(max_depth=8, n_estimators=200)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(max_depth=8, n_estimators=200)
# Train-set metrics for the tuned random forest.
res_rftrain = reportes(rf, X_train, y_train, nombre = "rf train")
res_rftrain
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| rf train | 0.9482 | 0.6538 | 0.8269 | 1.0 | 0.7907 |
# Test-set metrics for the tuned random forest.
res_rftest = reportes(rf, X_test, y_test, nombre = "rf test")
res_rftest
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| rf test | 0.8599 | 0.11 | 0.5506 | 0.6875 | 0.1897 |
# Test-set confusion matrix for the random forest.
confusion(rf, X_test, y_test)
%%time
# defining model
model = AdaBoostClassifier(random_state = seed)
# Transformador
scaler = QuantileTransformer()
# Pipeline
pipeline = Pipeline([('Standard scaler', scaler),
('Modelo', model)])
# Parameter grid to pass in GridSearchCV
param_grid = {
"Modelo__n_estimators": np.arange(10, 110, 10),
"Modelo__learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
}
cv = StratifiedKFold(n_splits=5, shuffle= True, random_state = seed)
# Calling GridSearchCV
grid_ab = GridSearchCV(estimator = pipeline, param_grid=param_grid, scoring=scorer, cv=cv, n_jobs = -1)
# Fitting parameters in GridSearchCV
grid_ab.fit(X_train, y_train)
# scorer: recall
print(f'Mejores parametros: {grid_ab.best_params_}')
print(f'Mejor score: {grid_ab.best_score_}')
Mejores parametros: {'Modelo__learning_rate': 1, 'Modelo__n_estimators': 60}
Mejor score: 0.4845072251128892
CPU times: user 612 ms, sys: 70.7 ms, total: 683 ms
Wall time: 8.3 s
# AdaBoost refit with the tuned hyper-parameters.
ada = AdaBoostClassifier(learning_rate=1,
                         n_estimators=60,
                         random_state=seed)
ada.fit(X_train, y_train)
AdaBoostClassifier(learning_rate=1, n_estimators=60, random_state=12345)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
AdaBoostClassifier(learning_rate=1, n_estimators=60, random_state=12345)
# Train-set metrics for the tuned AdaBoost.
res_adatrain = reportes(ada, X_train, y_train, nombre = "ada train")
res_adatrain
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| ada train | 0.899 | 0.5085 | 0.7381 | 0.7346 | 0.601 |
# Test-set metrics for the tuned AdaBoost.
res_adatest = reportes(ada, X_test, y_test, nombre = "ada test")
res_adatest
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| ada test | 0.8763 | 0.39 | 0.6757 | 0.6393 | 0.4845 |
confusion(ada, X_test, y_test)
%%time
# definir modelo
model = GradientBoostingClassifier(random_state = seed)
# Transformador
scaler = QuantileTransformer()
# Pipeline
pipeline = Pipeline([('Standard scaler', scaler),
('Modelo', model)])
# grilla de parametros
param_grid={
"Modelo__n_estimators": np.arange(25,100,25),
"Modelo__learning_rate": [0.2,0.01, 0.05, 1],
"Modelo__subsample":[0.2,0.3,0.4,0.5],
"Modelo__max_features":[0.5,0.6,0.7,0.8],
"Modelo__max_depth":range(5,9,1),
'Modelo__min_samples_split':range(400,800,100)}
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)
# llama a grid
grid_cv = GridSearchCV(estimator = pipeline, param_grid=param_grid, scoring=scorer, n_jobs = -1, cv=cv)
# Fit
grid_cv.fit(X_train, y_train)
print(f'Mejores parametros: {grid_cv.best_params_}')
print(f'Mejor score: {grid_cv.best_score_}')
Mejores parametros: {'Modelo__learning_rate': 0.2, 'Modelo__max_depth': 6, 'Modelo__max_features': 0.5, 'Modelo__min_samples_split': 400, 'Modelo__n_estimators': 75, 'Modelo__subsample': 0.5}
Mejor score: 0.485680711357207
CPU times: user 29.1 s, sys: 3.72 s, total: 32.8 s
Wall time: 4min 2s
# Gradient boosting refit with the tuned hyper-parameters.
gbm = GradientBoostingClassifier(learning_rate=0.2,
                                 n_estimators=75,
                                 max_depth=6,
                                 max_features=0.5,
                                 min_samples_split=400,
                                 subsample=0.5,
                                 random_state=seed)
gbm.fit(X_train, y_train)
GradientBoostingClassifier(learning_rate=0.2, max_depth=6, max_features=0.5,
min_samples_split=400, n_estimators=75,
random_state=12345, subsample=0.5)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GradientBoostingClassifier(learning_rate=0.2, max_depth=6, max_features=0.5,
min_samples_split=400, n_estimators=75,
random_state=12345, subsample=0.5)# Resultados en train
# Train-set metrics for the tuned GBM.
res_gbmtrain = reportes(gbm, X_train, y_train, nombre = "gbm train")
res_gbmtrain
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| gbm train | 0.9297 | 0.6154 | 0.8002 | 0.878 | 0.7236 |
# Test-set metrics for the tuned GBM.
res_gbmtest = reportes(gbm, X_test, y_test, nombre = "gbm test")
res_gbmtest
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| gbm test | 0.8793 | 0.36 | 0.6651 | 0.6792 | 0.4706 |
confusion(gbm, X_test, y_test)
%%time
# Definir modelo
cbc = CatBoostClassifier(logging_level = 'Silent', random_state = seed)
# Transformador
scaler = QuantileTransformer()
# Pipeline
pipeline = Pipeline([('Quantile', scaler),
('Modelo', cbc)])
# Crear la grilla
grid = {'Modelo__max_depth': [3,4,5],'Modelo__n_estimators':[100, 200, 300],'Modelo__learning_rate' : [0.01, 0.05, 0.1]}
#cv
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)
# Llama a grid search
gscv = GridSearchCV (estimator = pipeline, param_grid = grid, scoring = scorer, cv = cv, verbose = False)
#fit the model
gscv.fit(X_train,y_train)
# Resultados
print(f'Mejor estimador: {gscv.best_estimator_}')
print(f'Mejores parametros: {gscv.best_params_}')
print(f'Mejor score: {gscv.best_score_}')
Mejor estimador: Pipeline(steps=[('Quantile', QuantileTransformer()),
('Modelo',
<catboost.core.CatBoostClassifier object at 0x7fad98ca8100>)])
Mejores parametros: {'Modelo__learning_rate': 0.1, 'Modelo__max_depth': 4, 'Modelo__n_estimators': 300}
Mejor score: 0.48649255099505684
CPU times: user 44.9 s, sys: 25.3 s, total: 1min 10s
Wall time: 31.7 s
# Refit with (mostly) the tuned hyper-parameters.
# NOTE(review): the grid search selected n_estimators=300 but 200 is used here —
# confirm whether this cap is intentional (early stopping below limits iterations anyway).
catboost = CatBoostClassifier(
    max_depth = 4,
    random_state = seed,
    n_estimators = 200,
    learning_rate = 0.1,
    logging_level = 'Silent' # suppress per-iteration logging
)
# NOTE(review): early stopping monitors the test set, which leaks the hold-out
# into model selection — a dedicated validation split would be cleaner.
catboost.fit(X_train, y_train,eval_set=(X_test, y_test),early_stopping_rounds = 10,use_best_model = True)
<catboost.core.CatBoostClassifier at 0x7f8078f7bb20>
# Train-set metrics for the tuned CatBoost.
res_cbtrain = reportes(catboost, X_train, y_train, nombre = "catboosting train")
res_cbtrain
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| catboosting train | 0.9118 | 0.4444 | 0.7192 | 0.9286 | 0.6012 |
# Test-set metrics for the tuned CatBoost.
res_cbtest = reportes(catboost, X_test, y_test, nombre = "catboosting test")
res_cbtest
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| catboosting test | 0.8629 | 0.2 | 0.5895 | 0.625 | 0.303 |
confusion(catboost, X_test, y_test)
El parámetro scale_pos_weight es ideal cuando existe un evidente desbalance en la distribución de las clases.
# CatBoost with class re-weighting: scale_pos_weight compensates for the minority
# positive class; same tuned depth/learning-rate as the unweighted model above.
catboost1 = CatBoostClassifier(max_depth=4,
                               n_estimators=200,
                               learning_rate=0.1,
                               scale_pos_weight=3,
                               random_state=seed,
                               logging_level='Silent')  # suppress per-iteration logging
# NOTE(review): early stopping monitors the test set, which leaks the hold-out
# into model selection — consider a dedicated validation split.
catboost1.fit(X_train, y_train, eval_set=(X_test, y_test),
              early_stopping_rounds=30, use_best_model=True)
<catboost.core.CatBoostClassifier at 0x7f8049fab130>
# Train-set metrics for the class-weighted CatBoost.
res_cat1train = reportes(catboost1, X_train, y_train, nombre = "cat scale train")
res_cat1train
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| cat scale train | 0.9252 | 0.8034 | 0.875 | 0.7259 | 0.7627 |
# Test-set metrics for the class-weighted CatBoost.
res_cat1test = reportes(catboost1, X_test, y_test, nombre = "cat scale test")
res_cat1test
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| cat scale test | 0.8838 | 0.6 | 0.7667 | 0.6122 | 0.6061 |
confusion(catboost1, X_test, y_test)
# Baseline LightGBM (default hyper-parameters), monitored on the test set.
lgbmc_base = LGBMClassifier(random_state = seed)
lgbmc_base.fit(X_train, y_train,eval_set=(X_test, y_test),feature_name = 'auto', verbose = 0)
LGBMClassifier(random_state=12345)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LGBMClassifier(random_state=12345)
# Train-set metrics for the baseline LightGBM.
res_lgbmtrain = reportes(lgbmc_base, X_train, y_train, nombre = "light gbm train")
res_lgbmtrain
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| light gbm train | 0.9955 | 0.9872 | 0.9921 | 0.983 | 0.9851 |
# Test-set metrics for the baseline LightGBM.
res_lgbmtest = reportes(lgbmc_base, X_test, y_test, nombre = "light gbm test")
res_lgbmtest
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| light gbm test | 0.8763 | 0.37 | 0.6675 | 0.6491 | 0.4713 |
confusion(lgbmc_base, X_test, y_test)
El parámetro scale_pos_weight es ideal cuando existe un evidente desbalance en la distribución de las clases.
# LightGBM with class re-weighting (scale_pos_weight=3) for the imbalanced target.
lgbmc_3 = LGBMClassifier(random_state = seed, scale_pos_weight = 3)
lgbmc_3.fit(X_train, y_train,eval_set = (X_test, y_test),feature_name = 'auto', verbose=0)
LGBMClassifier(random_state=12345, scale_pos_weight=3)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LGBMClassifier(random_state=12345, scale_pos_weight=3)
# Train-set metrics for the class-weighted LightGBM.
res_lgbmtrain3 = reportes(lgbmc_3, X_train, y_train, nombre = "light3 train")
res_lgbmtrain3
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| light3 train | 0.9942 | 1.0 | 0.9966 | 0.963 | 0.9811 |
# Test-set metrics for the class-weighted LightGBM.
res_lgbmtest3 = reportes(lgbmc_3, X_test, y_test, nombre = "light3 test")
res_lgbmtest3
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| light3 test | 0.8703 | 0.39 | 0.6722 | 0.6 | 0.4727 |
confusion(lgbmc_3, X_test, y_test)
%%time
# XGBoost
# n_jobs=-1 to allow run it on all cores
model = XGBClassifier(random_state = seed)
# Transformador
scaler = QuantileTransformer()
# Pipeline
pipeline = Pipeline([('Quantile', scaler),
('Modelo', model)])
params = {
'Modelo__n_estimators': [100, 200, 500],
'Modelo__learning_rate': [0.01,0.05,0.1],
'Modelo__booster': ['gbtree', 'gblinear'],
'Modelo__reg_alpha': [0, 0.5, 1],
'Modelo__reg_lambda': [0.5, 1, 5],
'Modelo__base_score': [0.2, 0.5, 1]
}
#cv
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = seed)
xgb = GridSearchCV(pipeline, params, n_jobs =-1, cv = cv, scoring = scorer)
xgb.fit(X_train, y_train)
print('Best score:', xgb.best_score_)
print('Best score:', xgb.best_params_)
Best score: 0.48445779581013754
Best score: {'Modelo__base_score': 0.5, 'Modelo__booster': 'gbtree', 'Modelo__learning_rate': 0.1, 'Modelo__n_estimators': 100, 'Modelo__reg_alpha': 0, 'Modelo__reg_lambda': 0.5}
CPU times: user 7.66 s, sys: 961 ms, total: 8.63 s
Wall time: 1min 58s
# XGBoost refit with the tuned hyper-parameters.
xgbo = XGBClassifier(base_score=0.5,
                     booster='gbtree',
                     learning_rate=0.1,
                     n_estimators=100,
                     reg_alpha=0,
                     reg_lambda=0.5,
                     random_state=seed)
xgbo.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.1, max_bin=256,
max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=12345, ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
grow_policy='depthwise', importance_type=None,
interaction_constraints='', learning_rate=0.1, max_bin=256,
max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=12345, ...)# Resultados en train
# Train-set metrics for the tuned XGBoost.
res_xbtrain = reportes(xgbo, X_train, y_train, nombre = "Xgboost train")
res_xbtrain
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| Xgboost train | 0.9949 | 0.9701 | 0.9847 | 0.9956 | 0.9827 |
# Test-set metrics for the tuned XGBoost.
res_xbtest = reportes(xgbo, X_test, y_test, nombre = "Xgboost test")
res_xbtest
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| Xgboost test | 0.8733 | 0.32 | 0.6451 | 0.6531 | 0.4295 |
confusion(xgbo, X_test, y_test)
# Stack every train/test report into a single comparison table.
reportes_todos = [res_dttrain, res_dttest, res_rftrain, res_rftest,
                  res_gbmtrain, res_gbmtest, res_cbtrain, res_cbtest,
                  res_cat1train, res_cat1test, res_adatrain, res_adatest,
                  res_lgbmtrain, res_lgbmtest, res_lgbmtrain3, res_lgbmtest3,
                  res_xbtrain, res_xbtest]
final = pd.concat(reportes_todos, axis=0)
final
| Accuracy | Recall | Roc-Auc | Precision | F1 | |
|---|---|---|---|---|---|
| decision train | 0.8569 | 0.0513 | 0.5249 | 0.8571 | 0.0968 |
| decision test | 0.8525 | 0.0300 | 0.5132 | 0.6000 | 0.0571 |
| rf train | 0.9482 | 0.6538 | 0.8269 | 1.0000 | 0.7907 |
| rf test | 0.8599 | 0.1100 | 0.5506 | 0.6875 | 0.1897 |
| gbm train | 0.9297 | 0.6154 | 0.8002 | 0.8780 | 0.7236 |
| gbm test | 0.8793 | 0.3600 | 0.6651 | 0.6792 | 0.4706 |
| catboosting train | 0.9118 | 0.4444 | 0.7192 | 0.9286 | 0.6012 |
| catboosting test | 0.8629 | 0.2000 | 0.5895 | 0.6250 | 0.3030 |
| cat scale train | 0.9252 | 0.8034 | 0.8750 | 0.7259 | 0.7627 |
| cat scale test | 0.8838 | 0.6000 | 0.7667 | 0.6122 | 0.6061 |
| ada train | 0.8990 | 0.5085 | 0.7381 | 0.7346 | 0.6010 |
| ada test | 0.8763 | 0.3900 | 0.6757 | 0.6393 | 0.4845 |
| light gbm train | 0.9955 | 0.9872 | 0.9921 | 0.9830 | 0.9851 |
| light gbm test | 0.8763 | 0.3700 | 0.6675 | 0.6491 | 0.4713 |
| light3 train | 0.9942 | 1.0000 | 0.9966 | 0.9630 | 0.9811 |
| light3 test | 0.8703 | 0.3900 | 0.6722 | 0.6000 | 0.4727 |
| Xgboost train | 0.9949 | 0.9701 | 0.9847 | 0.9956 | 0.9827 |
| Xgboost test | 0.8733 | 0.3200 | 0.6451 | 0.6531 | 0.4295 |
# Test-set reports only, ranked by F1 (ascending) and rendered as a bar chart.
final_test = pd.concat([res_dttest, res_rftest, res_gbmtest, res_cbtest,
                        res_cat1test, res_adatest, res_lgbmtest,
                        res_lgbmtest3, res_xbtest], axis=0)
final_test = final_test.sort_values(by=['F1'], ascending=True)
px.bar(final_test, x='F1', y=final_test.index,
       title='Comparacion modelos test set', height=500,
       labels={'index': 'Modelos'})
# Horizontal bar chart of the chosen model's (catboost1) feature importances,
# sorted so the most important feature is drawn at the top.
feature_names = X.columns
importances = catboost1.feature_importances_
indices = np.argsort(importances)  # ascending order of importance

plt.figure(figsize=(12, 12))
plt.title("Importancia de variables")
plt.barh(range(len(indices)), importances[indices],
         color="lightblue", align="center")
plt.yticks(range(len(indices)), feature_names[indices])
plt.xlabel("Importancia relativa")
plt.show()
# Persist the chosen model (class-weighted CatBoost) for the web application.
# Bug fix: the original open() never closed its file handle; a context manager
# guarantees the file is flushed and closed even if dump() raises.
# NOTE(review): unpickling executes arbitrary code — keep this artifact trusted.
with open('../web/model.pkl', 'wb') as model_file:
    pickle.dump(catboost1, model_file)
# #Learning curve
# estimador = catboost1
# train_sizes, train_scores, test_scores = learning_curve(
# estimator = estimador,
# X = X,
# y = y,
# cv=5,
# scoring="recall",
# train_sizes = [100, 500, 1000, 1300]
# )
# train_mean = -train_scores.mean(axis=1)
# test_mean = -test_scores.mean(axis=1)
# plt.subplots(figsize=(10,8))
# plt.plot(train_sizes, train_mean, label="train")
# plt.plot(train_sizes, test_mean, label="validation")
# plt.title("Learning Curve")
# plt.xlabel("Training Set Size")
# plt.ylabel("Recall")
# plt.legend(loc="best")
# plt.show()